* Change from hourlyweather_inc_clean_and_vargen_GMTadj_HS: combine ALL months before doing the averaging, in order to avoid gaps at the end of each month.
* When launching this script, the working directory should be the folder into which the repository was extracted (as indicated in master_revised.do).

* Build one stacked station-by-date-by-hour panel of mean weather readings
* covering every month of 2000-2004, accumulated in the scratch tempfile.
* The average tempfile is only reserved here; it is filled later in the script.
tempfile scratch average

* Source variable names that are kept from the hourly files.
* WindDirection and PressureSeaLevel are dropped because they are not needed.
local Weathervars "PrecipTotal Temperature DewPoint Pressure WindSpeed RelativeHumidity"
* Short names, matched by position to the list above.
* cf. Heyes & Saberian names: "precip temperature dewpoint airpressure windspeed humidity"
local weathervars "prcp temp dew press wind rh"

forvalues y=2000/2004 {
forvalues m=1/12 {

	// The monthly files use a two-digit month suffix (01 ... 12).
	local month : display %02.0f `m'

	use data/original_article/data/Data/out/noaa_qclcd_hourly_`y'_`month', clear
	rename Station* *
	keep  `Weathervars' WBANNumber stata_time

	// Attach each station's timezone and keep matched observations only.
	// can't assert(2 3) bc timezone missing for 14779 (2002/9), 52925 (2001/7-2003/2), 54974 (2003/9-2004/1), 99901(2000/1, 2001/5-8), 99900/2 (2001/6-9), multiple 995.. (2002/5-7) -- some of these (e.g., 52925) aren't even in NOAA master list (https://www.ncdc.noaa.gov/homr/file/wbanmasterlist.html.zip)
	merge m:1 WBANNumber using data/original_article/data/Data/out/noaa_qclcd_station_timezones.dta, keep(3) nogenerate

	// Shift the timestamp by StationTimezone hours, plus one extra hour in months 4-10.
	// "inrange" is (rough) adjustment for DST, which was 1st Sunday of April through last Sunday of October before 2007.
	// The multiplication converts hours to milliseconds: 1000 ms/s, 60 s/m, 60 m/h.
	replace stata_time = stata_time + (StationTimezone+inrange(`m',4,10))*1000*60*60
	generate int date=dofc(stata_time)
	format date %td
	generate byte hour=hh(stata_time)

	// Switch to the short names and average all readings within each station-date-hour cell.
	rename (`Weathervars' WBANNumber)(`weathervars' wbannumber)
	collapse (mean) `weathervars', by(date hour wbannumber)
	compress

	// Stack the months processed so far underneath the current month.
	// The scratch file does not exist on the first pass, so test for it explicitly
	// instead of capturing the append: a captured append would also hide genuine
	// failures, and the save below would then overwrite all earlier months.
	capture confirm file "`scratch'"
	if _rc==0 append using `scratch'
	save `scratch', replace
 }
 }

* Remove observations that are identical on every variable. There could be
* duplicates because the first of the next month seems to be in some months'
* data sets, leading to duplicates that mess up merge etc.
* NOTE(review): with no varlist this drops exact copies only; rows that share
* date, hour and wbannumber but differ in any weather value would remain -- confirm.
duplicates drop

* Full-day means: one row per station-date, written to the average tempfile.
* n is the number of non-missing temp values that went into the station-date.
preserve
	collapse (mean) `weathervars' (count) n=temp, by(date wbannumber)
	* Keep only complete measurements -- that's the vast majority of
	* wban-date combinations (80-100% depending on month).
	keep if n>=24
	foreach v of local weathervars {
		rename `v' avg_`v'
	}
	save `average', replace
restore

* Means over hours 6 through 15 (ten hourly cells), suffixed 6t4.
keep if hour>=6 & hour<=15
collapse (mean) `weathervars' (count) n=temp, by(date wbannumber)
keep if n>=10
foreach v of local weathervars {
	rename `v' `v'6t4
}

* Combine the two sets of daily means. Unmatched rows on either side, (1) or (2),
* could arise from the earlier drops of date-wban's with too few hourly measurements.
merge 1:1 date wbannumber using `average', keep(match) nogenerate

* Attach station coordinates and discard stations without a usable location.
merge m:1 wbannumber using data/original_article/data/Data/out/noaa_qclcd_station_locations, keep(match) nogenerate keepusing(latitude longitude)
drop if missing(latitude) | missing(longitude)

keep date wbannumber *6t4 avg_* latitude longitude
compress

* Stop with an error unless date and coordinates uniquely identify each row.
isid date latitude longitude
save data/original_article/reconstructed_data/hourlyweather_vargen_GMTadj_nogaps_HS.dta, replace